// .........................................................................
// Title: industry_morningstar.do
//
// Compiles a list of the modal industry assignment for each security in the
// Morningstar data, as classified by each of the individual reporting funds;
// finds the modal assignment within each fund first, and then across funds
// .........................................................................

* Make sure the scratch directory exists; -capture- swallows the error raised
* when it is already there. The path is quoted so that a $tmp containing
* spaces does not split into several arguments.
cap mkdir "$tmp/industry"

* Append the yearly HD_<year>_m.dta files and keep only the relevant variables.
* Start from an empty dataset: -append- adds to whatever is in memory, so any
* data left over from a calling do-file would otherwise leak into the result.
clear
forvalues x=2005/2020 {
	* Progress marker for the log
	di "`x'" 
	append using "$morningstar_hd/HD_`x'_m.dta", keep(cusip6 gicsindustryid MasterPortfolioId)
	desc
} 

* Adjust MS classifications
* The industry code used throughout is the first six characters of
* gicsindustryid.
gen gics6 = substr(gicsindustryid, 1, 6)

* MS uses the old classification for real estate: recode the 4040xx codes
* to their 6010xx counterparts.
replace gics6 = "601010" if inlist(gics6, "404010", "404020")
replace gics6 = "601020" if gics6 == "404030"

* Semi-conductors: recode 452050 to 453010.
replace gics6 = "453010" if gics6 == "452050"

* Only rows with both an issuer identifier and an industry code are usable.
keep if cusip6 != "" & gics6 != ""

* Find fund-specific modal industry assigned to each CUSIP
*
* Step 1: count how often each fund (MasterPortfolioId) reports each
* (cusip6, gics6) pair. The collapse leaves one row per fund-CUSIP-industry.
* The fund identifier is spelled out in full so the code does not depend on
* variable-name abbreviation being enabled.
gen long counter = 1 if !missing(gics6)
collapse (sum) industry_fund_count=counter, by(cusip6 gics6 MasterPortfolioId) fast

* Step 2: within each fund-CUSIP, keep only the most frequently reported
* industry (or industries, when several are tied for first place).
bysort cusip6 MasterPortfolioId: egen double industry_fund_count_max=max(industry_fund_count)
drop if industry_fund_count<industry_fund_count_max

* Step 3: split the equally frequent by keeping one of them at random.
* To make the draw reproducible the rows are first put in a fully determined
* order (the sort key is unique after the collapse) and the seed is fixed.
* NOTE: -set seed- resets Stata's global random-number state.
sort cusip6 MasterPortfolioId gics6
set seed 20050101
gen double randt = runiform()

* Keeping the last row of each group after sorting on randt retains exactly
* one industry per fund-CUSIP, even in the event of an exact tie in randt.
bysort cusip6 MasterPortfolioId (randt gics6): keep if _n == _N
drop randt

* Find modal industry assigned to each CUSIP across funds
*
* Step 1: every remaining row is one fund's assignment for a CUSIP; count how
* many funds chose each industry. The collapse leaves one row per
* (cusip6, gics6) pair and discards all fund-level helper variables.
gen long counter = 1 if !missing(gics6)
collapse (sum) industry_count=counter, by(cusip6 gics6) fast

* Step 2: keep the industry (or tied industries) chosen by the most funds.
bysort cusip6: egen double industry_count_max=max(industry_count)
drop if industry_count<industry_count_max

* Step 3: split the equally frequent by keeping one of them at random.
* The rows are put in a fully determined order (the sort key is unique after
* the collapse) and the seed is fixed so repeated runs give the same file.
* NOTE: -set seed- resets Stata's global random-number state.
sort cusip6 gics6
set seed 20200101
gen double randt = runiform()

* Keeping the last row of each group after sorting on randt retains exactly
* one industry per CUSIP, even in the event of an exact tie in randt.
bysort cusip6 (randt gics6): keep if _n == _N
drop randt

* Save modal industry assignments
keep cusip6 gics6
save "$tmp/Internal_Industry_NonUS_US.dta", replace

